In [72]:
import numpy as np
import shutil
import os
import json
import glob
from itertools import count
from scipy.sparse import coo_matrix, csr_matrix, identity, tril, triu, dia_matrix
from scipy.sparse.linalg import eigsh
import random
import matplotlib.pyplot as plt
from scipy.cluster.vq import kmeans2
from math import sqrt

In [53]:
def get_all_files(basedir, ext):
    '''
    Recursively collect every file under `basedir` whose name ends with `ext`.

    Arguments

    basedir: string
    Path to extracted dataset

    ext: string
    File extension (e.g. '.json')

    Returns

    files: ndarray
    Paths of all files matching the extension

    filenames: dict
    Keys are filenames (without extension), values are the root path of the
    directory containing the corresponding file
    '''
    filenames = {}
    collected = np.array([])
    for root, _, fnames in os.walk(basedir):
        matched = glob.glob(os.path.join(root, '*' + ext))
        collected = np.concatenate((collected, matched))
        filenames.update((os.path.splitext(fn)[0], root) for fn in fnames)
    return (collected, filenames)

In [54]:
def get_adjacency_matrix(files, t):
    '''
    Construct the weighted symmetric adjacency matrix in COOrdinate format.

    Arguments

    files: ndarray (or iterable) of filepaths to song JSON files
    t: int
    threshold for similarity strength; an edge is added only when the
    similarity score is strictly greater than t

    Returns

    W: sparse.coo_matrix
    Sparse weighted symmetric adjacency matrix

    song_to_idx: dict
    Maps track_id to its row/column index in W
    '''
    song_to_idx = {}
    song_count = count()
    rc2v = {}  # (row, col) -> edge weight; kept symmetric at all times

    for file in files:  # parse each file in files
        with open(file, 'r') as f:
            song = json.load(f)  # get song of each file
        tid = song['track_id']
        if tid not in song_to_idx:
            song_to_idx[tid] = next(song_count)  # assign unique id to each song
        for sid, similarity_score in song['similars']:
            if sid not in song_to_idx:
                song_to_idx[sid] = next(song_count)  # assign unique id to each song
            if similarity_score > t:  # add edge only if score strictly greater than threshold t
                row = song_to_idx[tid]
                col = song_to_idx[sid]
                # Undirected graph: the same pair may be reported from both
                # endpoints with different scores — keep the maximum.
                # (Bug fix: the original ignored any score seen after the
                # first one for a pair, so the max was never taken.)
                best = max(similarity_score, rc2v.get((row, col), similarity_score))
                rc2v[(row, col)] = rc2v[(col, row)] = best

    n = len(song_to_idx)  # ensure that the matrix is square
    if not rc2v:  # guard: no edge passed the threshold (original crashed here)
        return (coo_matrix((n, n)), song_to_idx)
    rows, cols = zip(*rc2v.keys())
    W = coo_matrix((list(rc2v.values()), (rows, cols)), shape=(n, n))
    return (W, song_to_idx)

In [55]:
def get_degree_matrix(M):
    '''
    Build the out-degree matrix of an adjacency matrix.

    Arguments

    M: sparse.xxx_matrix (where xxx can be coo, csr, csc, ...)
    Adjacency matrix

    Returns

    D: sparse.dia_matrix
    Out-degree matrix of M
    '''
    # NOTE(review): the degree here counts incident edges (nonzero entries per
    # row), not summed edge weights — for a weighted Laplacian the weight sum
    # may have been intended; confirm against the analysis requirements.
    out_degrees = M.getnnz(axis=1)
    n_rows, n_cols = M.shape
    D = dia_matrix((n_rows, n_cols))  # empty diagonal matrix of the same size as M
    D.setdiag(out_degrees)  # place the per-node out-degrees on the main diagonal
    return D

In [56]:
def get_laplacian_matrix(D, W, normalized):
    '''
    Compute the (normalized or un-normalized) graph Laplacian matrix.

    Arguments

    D: sparse.dia_matrix
    Out-degree matrix

    W: sparse.xxx_matrix (where xxx can be coo, csr, csc, ...)
    Sparse weighted symmetric adjacency matrix

    normalized: boolean
    If True, return the symmetric normalized Laplacian D^(-1/2) L D^(-1/2)

    Returns

    L: sparse matrix
    un-/normalized Laplacian matrix
    '''
    L = D - W  # un-normalized Laplacian matrix
    if not normalized:
        return L
    # Compute D^(-1/2) directly from the diagonal. This replaces the original
    # CSR fancy-indexing round trip, which was slow and triggered
    # SparseEfficiencyWarning; zero-degree nodes keep 0 (avoids division by zero).
    deg = np.asarray(D.diagonal(), dtype=np.float64)
    inv_sqrt = np.zeros_like(deg)
    positive = deg > 0
    inv_sqrt[positive] = 1.0 / np.sqrt(deg[positive])
    D_inv_sqrt = dia_matrix((inv_sqrt[np.newaxis, :], [0]), shape=D.shape)
    L_sym = D_inv_sqrt * L * D_inv_sqrt  # symmetric normalized Laplacian
    return L_sym

In [57]:
def is_symmetric(M):
    '''
    Check whether a sparse matrix is symmetric.

    Arguments

    M: sparse matrix

    Returns

    boolean:
    True iff M equals its transpose
    '''
    # Bug fix: the original assigned to local names `triu`/`tril`, shadowing
    # the imported scipy functions and raising UnboundLocalError on the call;
    # its `any(...)` index comparison was also not a symmetry test.
    M = M.tocsr()  # CSR supports efficient elementwise comparison
    return (M != M.T).nnz == 0  # symmetric iff no entry differs from the transpose

In [58]:
def spectral_shift(M):
    '''
    Shift the spectrum of M for efficient computation of the 'SM' eigens:
    B = M - lambda_max * I, so the smallest eigenvalues of M become the
    largest-magnitude eigenvalues of B.

    Arguments

    M: sparse matrix (symmetric)

    Returns

    B: sparse matrix
    Spectrally shifted version of M
    '''
    largest_eigvals, _ = eigsh(M, k=1)  # largest-magnitude eigenvalue of M
    return M - largest_eigvals[0] * identity(M.shape[0])

In [59]:
def power_iteration(M, iter=1000):
    '''
    Compute the eigenvector of M belonging to the eigenvalue of largest
    absolute value via power iteration.

    Arguments

    M: sparse matrix

    iter: int
    number of iterations (stopping criterion of the algorithm)

    Returns

    b: ndarray of shape (n, 1)
    Unit-norm dominant eigenvector of M
    '''
    # Start from a random unit vector.
    b = np.random.normal(size=(M.shape[0], 1))
    b /= np.linalg.norm(b, axis=0, ord=2)

    # Repeatedly apply M and renormalize; the iterate converges to the
    # dominant eigenvector (up to sign).
    for _ in range(iter):
        Mb = M * b  # single matrix-vector product per iteration
        b = Mb / np.linalg.norm(Mb, axis=0, ord=2)

    return b

In [60]:
def get_eigens(M, k, spectral, which, tol):
    '''
    Compute eigenvalues and eigenvectors of a sparse matrix.

    Arguments

    M: sparse matrix

    k: int
    number of eigenvalues

    spectral: boolean
    Whether to compute the eigens of the spectrally shifted matrix

    which: string
    See sparse.linalg.eigsh docs

    tol: double
    See sparse.linalg.eigsh docs

    Returns

    w: ndarray
    k eigenvalues

    v: ndarray
    k eigenvectors
    '''
    # Spectral shifting turns the k smallest eigenvalues of M into the
    # largest-magnitude eigenvalues of the shifted matrix, which eigsh
    # computes efficiently.
    target = spectral_shift(M) if spectral else M
    return eigsh(target, k, which=which, tol=tol)

In [61]:
def cluster(data, k):
    '''
    Performs standard k-Means clustering algorithm
    
    Arguments
    
    data: ndarray
    A ‘M’ by ‘N’ array of ‘M’ observations in ‘N’ dimensions or a 
    length ‘M’ array of ‘M’ one-dimensional observations.
    
    k: int
    The number of clusters to form as well as the number of centroids to generate.
    
    Returns
    
    cent: ndarray
    k centroids
    
    var: ndarray
    labels of data assigned to k centroids
    '''
    cent, var = kmeans2(data, k)
    
    return (cent, var)

In [62]:
def spectral_clustering(eigens, k):
    '''
    Perform spectral clustering: k-means on the eigenvector embedding.

    Arguments

    eigens: ndarray
    Eigenvector matrix (rows are instances embedded in eigen-space)

    k: int
    number of clusters

    Returns

    labels: ndarray
    cluster label assigned to each instance
    '''
    _, labels = cluster(eigens, k)  # discard centroids, keep assignments
    return labels

In [63]:
def plot_data(data, k):
    '''
    Plot a histogram of data with k bins and return the bin counts.

    Arguments

    data: ndarray

    k: int
    number of bins

    Returns

    hist: ndarray
    count of data points in each of the k bins
    '''
    # Bug fix: np.histogram's `normed` keyword was deprecated and then removed
    # from NumPy; raw counts are the default behaviour, so no flag is needed.
    hist, bins = np.histogram(data, bins=k)
    width = 0.2 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2  # bar positions at bin midpoints
    plt.bar(center, hist, align='center', width=width)
    plt.show()
    return hist

(a) Construct a weighted symmetric matrix W


(b) Perform spectral clustering on the graph


Check implementation of

  • get_adjacency_matrix(files, t=0)
  • spectral_clustering(eigens, k)

(c) Calculate the ratio-cut corresponding to the minimal value according to the constrained relaxation version of the problem.


From Lecture we know that minimizing ratio cut for k > 2 is equivalent to solving following minimization problem

$\min_{C_1, ..., C_k} \mathrm{RatioCut}(C_1, ..., C_k) = \min_{C_1, ..., C_k} \mathrm{trace}(H^T L H)$ subject to $H^T H = I$

For the constrained relaxation version of the problem we consider arbitrary values for $H$, thus

$\min_{H \in \mathbb{R}^{|V| \times k}} \mathrm{trace}(H^T L H)$ subject to $H^T H = I$

This is a standard form of a trace minimization problem, for which we know from lecture that the solution is given by choosing $H$ as the matrix which contains the first $k$ eigenvectors of $L$ as columns.


In [82]:
def soft_min_ratio_cut(H, L=None):
    '''
    Compute the constrained-relaxation min ratio cut, trace(H^T L H),
    i.e. the indicator matrix can have any values.

    Arguments

    H: ndarray (or sparse matrix)
    Indicator matrix

    L: sparse matrix, optional
    Graph Laplacian. Falls back to the module-level global `L` when
    omitted, preserving backward compatibility with existing call sites.

    Returns

    mrc: float

    Minimum Ratio Cut
    '''
    if L is None:
        # Backward compat: the original implementation silently read the
        # global Laplacian; prefer passing L explicitly.
        L = globals()['L']
    H = csr_matrix(H)
    mrc = (H.transpose() * L * H).diagonal().sum()

    return mrc

(d) Calculate the ratio-cut given the hard assignment clustering you obtained in b).



In [83]:
def min_ratio_cut(labels, L, k):
    '''
    Calculate the ratio-cut value trace(H L H^T) of the graph for a hard
    cluster assignment. The Laplacian can be normalized or un-normalized.

    Arguments

    labels: ndarray
    Labels of instances computed e.g. by k-Means

    L: sparse matrix
    Graph Laplacian

    k: int
    number of clusters

    Returns

    mrc: float
    Minimum Ratio Cut
    '''
    rows = []
    for i in range(k):
        indices = np.argwhere(labels == i)
        h = np.zeros_like(labels, dtype=np.float64)
        # Bug fix: guard against empty clusters — the original divided by
        # sqrt(0) and raised ZeroDivisionError. An empty cluster simply
        # contributes a zero row to H.
        if indices.shape[0] > 0:
            h[indices[:, 0]] = 1.0 / sqrt(indices.shape[0])
        rows.append(h)
    H = csr_matrix(np.asarray(rows))
    mrc = (H * L * H.transpose()).diagonal().sum()

    return mrc

(e) Visualize the distribution of tags for each cluster by plotting a histogram of tags per cluster.



In [84]:
def get_songs_to_tags_matrix(files, song_to_row, g):
    '''
    Construct the songs-to-tags matrix in COOrdinate format.

    Arguments

    files: iterable of filepaths to song JSON files

    song_to_row: dict
    Maps track_id to its row index (as used in the adjacency matrix)

    g: int
    threshold for tag strength; a (song, tag) entry is set only when the
    tag count is >= g

    Returns

    N: sparse.coo_matrix
    Binary songs-to-tags matrix

    tags_to_col: dict
    Maps tag name to column index of N (every tag seen gets a column, even
    when its count never reaches the threshold)
    '''
    # The original docstring documented a nonexistent `tags_to_col` argument
    # and omitted `files`/`song_to_row`; fixed above.
    rows, cols, vals = [], [], []
    tags_to_col = {}
    col_count = count()

    for file in files:
        with open(file, 'r') as f:
            song = json.load(f)
        tid = song['track_id']
        for tag, cnt in song['tags']:
            if tag not in tags_to_col:
                tags_to_col[tag] = next(col_count)  # assign a column to each new tag
            if int(cnt) >= g:  # keep only sufficiently strong tags
                rows.append(song_to_row[tid])
                cols.append(tags_to_col[tag])
                vals.append(1)
    N = coo_matrix((vals, (rows, cols)), shape=(len(song_to_row), len(tags_to_col)))
    return (N, tags_to_col)

In [67]:
# TODO Implement k-figure with k plots
def plot_tags_per_cluster(labels, s2t, k):
    for i in range(k):
        class_ids = np.argwhere(labels == i)
        s2t = s2t.tocsr()
        tag_dist = N[cluster_1[:,0],:].sum(axis=0)
        plot_data(tag_dist, N.shape[1])

Main


Defines the main entry point of the program. All hyper-parameters and other parameters are defined here.


In [68]:
basedir = 'lastfm_subset'  # path to the extracted Last.fm dataset directory
ext = '.json'              # file extension of the song metadata files
t = 0                      # similarity-score threshold for adding graph edges
normalized = False         # whether to use the normalized Laplacian
k = 10                     # number of clusters (and of eigenvectors to compute)
spectral = True            # use spectral shifting when computing eigens
which = 'LM'               # eigsh mode: largest-magnitude eigenvalues (of the shifted matrix)
tol = 10e-2                # eigsh convergence tolerance (10e-2 == 0.1)
g = 50                     # tag-count threshold for the songs-to-tags matrix

In [69]:
files = get_all_files(basedir, ext)[0] # collect all *.json filepaths under the dataset directory (basedir)

In [70]:
W, song_indices = get_adjacency_matrix(files, t) # weighted symmetric adjacency matrix and track_id -> index mapping

In [73]:
D = get_degree_matrix(W) # out-degree matrix (count of nonzero entries per row of W)

In [74]:
L = get_laplacian_matrix(D, W, normalized) # graph Laplacian (un-normalized here, since normalized=False)

In [78]:
w, v = get_eigens(L, k, spectral, which, tol) # get k smallest eigenvalues and corresponding eigenvectors (via spectral shift)

In [79]:
labels = spectral_clustering(v, k) # k-means on the eigenvector embedding: one cluster label per song

In [80]:
plot_data(labels, k) # plot the distribution of songs over the k clusters


Out[80]:
array([ 1113,  2573,   780, 41666,  3909,   510, 19481,  1117, 97909, 37447])

In [27]:
soft_mrc = soft_min_ratio_cut(v) # relaxed (soft) min ratio-cut value, using the eigenvector matrix as H

In [28]:
mrc = min_ratio_cut(labels, L, k) # ratio-cut value for the hard k-means cluster assignment

In [29]:
s2t, t2c = get_songs_to_tags_matrix(files, song_indices, g) # songs-to-tags matrix and tag -> column-index mapping

In [ ]:
plot_tags_per_cluster()